This is a toy example of applying Bayesian optimization. We will use Bayesian optimization to find the minimum value of one simple function.
### install.packages("GPfit")
library(GPfit)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(purrr)
We will optimize this simple function:
\[\left(6x-2\right)^2\sin\left(12x-4\right)\]
First we define the function in R, then draw the graph of this function in the axes, and generate n0 initial points.
### Generate the function
# Objective to optimize: the 1-D Forrester-style test function
# f(x) = (6x - 2)^2 * sin(12x - 4), vectorized over x.
f <- function(x) (6 * x - 2)^2 * sin(12 * x - 4)
### Draw the graph of this function on a fine grid over [0, 1].
x <- seq(0, 1, by = 0.0001)
y <- f(x)
graph_data <- data.frame(x, y)
graph_data %>% ggplot() +
  geom_point(mapping = aes(x = x, y = y), size = 0.01, color = "navyblue") +
  theme_bw()
### Locate the grid minimum. The original scalar loop shadowed base::min,
### re-evaluated f() twice per grid point, and ignored the precomputed y;
### vectorized which.min() yields the identical x_min / min values.
y_min <- min(y)
min <- y_min            # kept for backward compatibility (was set by the old loop)
x_min <- x[which.min(y)]
### Generate the initial points: an equally spaced 4-point design on [0, 1].
x0 <- c(0, 1/3, 2/3, 1)
evaluation <- data.frame("x" = x0, "y" = f(x0))
Here we build the initial GP model. We choose the power exponential correlation function here.
# Prediction grid: 100 equally spaced points over [0, 1].
x_new <- seq(0, 1, length.out = 100)
# Fit the initial GP surrogate on the starting design; the power-exponential
# correlation with power 1.95 is close to a Gaussian kernel.
fit <- GP_fit(
  X = evaluation[, "x"],
  Y = evaluation[, "y"],
  corr = list(type = "exponential", power = 1.95)
)
# Posterior mean and standard deviation on the grid.
pred <- predict.GP(fit, xnew = data.frame(x = x_new))
mu <- pred$Y_hat
sigma <- sqrt(pred$MSE)
pred <- pred %>% data.frame()
# Plot the GP posterior: mean (gold line), +/- 1 sd band (navy ribbon), and the
# evaluated design points. The original repeated geom_point() once per design
# row and passed y/xmin/xmax aesthetics that geom_ribbon() does not use; a
# single data-driven geom_point() and a minimal ribbon mapping draw the same plot.
pred %>% ggplot() +
  geom_line(mapping = aes(x = complete_data.xnew.1, y = Y_hat), color = "gold") +
  geom_ribbon(mapping = aes(x = complete_data.xnew.1, ymin = Y_hat - sqrt(MSE), ymax = Y_hat + sqrt(MSE)), fill = "navyblue", alpha = 0.3) +
  geom_point(data = evaluation, mapping = aes(x = x, y = y), color = "navyblue") +
  xlab("x") +
  ylab("f(x)") +
  theme_bw()
Then we will use different acquisition functions to find the x to evaluate next.
### y_best is the current best (lowest) observed y value.
y_best <- min(evaluation[, "y"])
### Make the f(x) - x plot a function.
### NOTE(review): this shadows base::plot(); name kept because later code calls plot(x_next).
plot <- function(x_next) {
  pred %>% ggplot() +
    geom_line(mapping = aes(x = complete_data.xnew.1, y = Y_hat), color = "gold") +
    geom_ribbon(mapping = aes(x = complete_data.xnew.1, ymin = Y_hat - sqrt(MSE), ymax = Y_hat + sqrt(MSE)), fill = "navyblue", alpha = 0.3) +
    geom_point(data = evaluation, mapping = aes(x = x, y = y), color = "navyblue") +
    geom_vline(mapping = aes(xintercept = x_next), color = "red", linetype = "dashed") +
    # Mark the posterior mean at the proposed point. BUG FIX: the original used
    # Y_hat[ceiling(x_next / 0.01)], which assumes a grid step of 0.01, but
    # seq(0, 1, length.out = 100) has step 1/99 — the index was off for most x.
    # Look up the nearest grid point instead.
    geom_point(mapping = aes(x = x_next, y = Y_hat[which.min(abs(complete_data.xnew.1 - x_next))]), color = "red") +
    xlab("x") +
    ylab("f(x)") +
    theme_bw()
}
1. Probability of improvement
# Probability of improvement: P(f(x) < y_best) under the GP posterior
# (we are minimizing; for maximization one would use 1 - pnorm(...)).
probability_improvement <- map2_dbl(
  mu,
  sigma,
  function(m, s) {
    # Zero posterior sd means no chance of improving on the incumbent.
    if (s == 0) {
      return(0)
    }
    pnorm((y_best - m) / s)
  }
)
# Next evaluation point: the grid location maximizing the acquisition.
x_next_POI <- x_new[which.max(probability_improvement)]
ggplot() +
  geom_line(mapping = aes(x = x_new, y = probability_improvement), color = "navyblue") +
  geom_vline(mapping = aes(xintercept = x_next_POI), color = "red", linetype = "dashed") +
  xlab("x") +
  theme_bw()
plot(x_next_POI)
# Expected improvement: E[max(y_best - f(x), 0)] under the GP posterior.
expected_improvement <- map2_dbl(
  mu, sigma,
  function(m, s) {
    if (s == 0) {
      return(0)  # no uncertainty -> no expected improvement
    }
    z <- (y_best - m) / s
    s * (z * pnorm(z) + dnorm(z))
  }
)
# Next evaluation point: the grid location maximizing the acquisition.
x_next_EI <- x_new[which.max(expected_improvement)]
ggplot() +
  geom_line(mapping = aes(x = x_new, y = expected_improvement), color = "navyblue") +
  geom_vline(mapping = aes(xintercept = x_next_EI), color = "red", linetype = "dashed") +
  xlab("x") +
  theme_bw()
plot(x_next_EI)
kappa <- 2  # exploration weight (tunable)
# Lower confidence bound mu - kappa * sigma; when maximizing one would use
# the upper bound mu + kappa * sigma instead.
lower_confidence_bound <- mu - kappa * sigma
### Notice here is min, not max: we minimize the lower bound.
x_next_LCB <- x_new[which.min(lower_confidence_bound)]
ggplot() +
  geom_line(mapping = aes(x = x_new, y = lower_confidence_bound), color = "navyblue") +
  geom_vline(mapping = aes(xintercept = x_next_LCB), color = "red", linetype = "dashed") +
  xlab("x") +
  theme_bw()
plot(x_next_LCB)
In the last section, we discussed how Bayesian optimization works in one loop. In this section, we will continue looping until we get the optimized value of the simple function.
Firstly we will choose a “good” acquisition function.
This acquisition function measures the likelihood that the next x is better (lower or higher, depending on our goal) than the current optimal value.
# --- Bayesian optimization loop driven by probability of improvement ---
evaluation_t <- evaluation  # snapshot of the starting design, reused by later sections
t <- 1
# Accumulators. (The original assigned NULL "columns" to these empty data
# frames — a no-op — so those lines are dropped.)
data <- data.frame()        # per-iteration GP predictions for faceted plotting
acq_data <- data.frame()    # per-iteration acquisition values + chosen x_next
x_new <- seq(0, 1, length.out = 100)  # prediction grid; loop-invariant, hoisted
while (t <= 10) {
  # Evaluate the objective at the previously proposed point.
  evaluation[nrow(evaluation) + 1, ] <- c(x_next_POI, f(x_next_POI))
  # BUG FIX: refresh the incumbent best each iteration. The original kept the
  # stale y_best computed before the loop, mis-scaling the acquisition.
  y_best <- min(evaluation[, "y"])
  # Refit the GP surrogate on the enlarged design.
  fit <- GP_fit(
    X = evaluation[, "x"],
    Y = evaluation[, "y"],
    corr = list(type = "exponential", power = 1.95)
  )
  pred <- predict.GP(fit, xnew = data.frame(x = x_new))
  mu <- pred$Y_hat
  sigma <- sqrt(pred$MSE)
  # Record predictions tagged with the iteration number for faceting.
  pred <- pred %>% data.frame() %>% select(c("Y_hat", "MSE", "complete_data.xnew.1"))
  pred <- pred %>% mutate("iteration" = t) %>% mutate("x_x" = NA) %>% mutate("y_y" = NA)
  data <- rbind(data, pred)
  # Probability of improvement (minimization).
  probability_improvement <- map2_dbl(
    mu,
    sigma,
    function(m, s) {
      if (s == 0) return(0)
      pnorm((y_best - m) / s)
    }
  )
  x_next_POI <- x_new[which.max(probability_improvement)]
  # Record the acquisition curve and the chosen point for this iteration.
  probability_improvement <- probability_improvement %>% as.data.frame()
  colnames(probability_improvement) <- "probability_improvement"
  probability_improvement <- probability_improvement %>%
    mutate("iteration" = t) %>%
    mutate("x_next_POI" = NA)
  acq_data <- rbind(acq_data, probability_improvement)
  x_next_df <- data.frame(x_next_POI) %>% mutate("probability_improvement" = NA) %>% mutate("iteration" = t)
  acq_data <- rbind(acq_data, x_next_df)
  t <- t + 1
}
# Append the observed design points to `data` so they can be overlaid on the
# faceted posterior plots. After BO iteration k (k = 1..10) the design holds
# i = k + 4 points (4 initial + k acquired), hence i runs over 5:14 and the
# facet index is i - 4. Prediction columns (Y_hat, MSE, complete_data.xnew.1)
# are padded with NA; x_x / y_y carry the point coordinates.
# NOTE(review): assumes `evaluation` holds exactly 14 rows in acquisition
# order at this point — confirm against the preceding loop.
t <- 1
nr <- nrow(data)
for (i in 5:14) {
for (j in 1:i) {
data[nr + t,] = c(NA, NA, NA, i-4, evaluation[j, 1], evaluation[j, 2])
t <- t+1
}
}
# Faceted posterior per iteration: mean (gold), +/- 1 sd band, design points.
plot_1 <- ggplot(data) +
  geom_line(aes(x = complete_data.xnew.1, y = Y_hat), color = "gold") +
  geom_ribbon(aes(x = complete_data.xnew.1, y = Y_hat, xmin = 0, xmax = 1, ymin = Y_hat - sqrt(MSE), ymax = Y_hat + sqrt(MSE)), fill = "navyblue", alpha = 0.3) +
  geom_point(aes(x = x_x, y = y_y)) +
  facet_grid(~iteration) +
  xlab("x") +
  ylab("f(x)") +
  theme_bw()
# One extra NA per iteration aligns x_new with the appended x_next row in acq_data.
x_new_df <- as.data.frame(rep(c(x_new, NA), 10))
colnames(x_new_df) <- "x_new"
acq_data <- cbind(acq_data, x_new_df)
# Faceted acquisition curve with the chosen point marked per iteration.
plot_2 <- ggplot(acq_data) +
  geom_line(aes(x = x_new, y = probability_improvement), color = "navyblue") +
  geom_vline(aes(xintercept = x_next_POI), color = "red", linetype = "dashed") +
  xlab("x") +
  facet_grid(~ iteration) +
  theme_bw()
print(plot_1)
## Warning: Removed 14 row(s) containing missing values (geom_path).
## Warning: Removed 1000 rows containing missing values (geom_point).
print(plot_2)
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1000 rows containing missing values (geom_vline).
This acquisition function takes how large the improvement is into account.
# --- Bayesian optimization loop driven by expected improvement ---
evaluation <- evaluation_t  # restart from the initial 4-point design
t <- 1
# Accumulators. (The original's NULL-column assignments on empty data frames
# were no-ops and are dropped.)
data <- data.frame()
acq_data <- data.frame()
x_new <- seq(0, 1, length.out = 100)  # prediction grid; loop-invariant, hoisted
while (t <= 10) {
  evaluation[nrow(evaluation) + 1, ] <- c(x_next_EI, f(x_next_EI))
  # BUG FIX: refresh the incumbent best each iteration; the original reused
  # the stale pre-loop y_best, distorting the expected-improvement values.
  y_best <- min(evaluation[, "y"])
  fit <- GP_fit(
    X = evaluation[, "x"],
    Y = evaluation[, "y"],
    corr = list(type = "exponential", power = 1.95)
  )
  pred <- predict.GP(fit, xnew = data.frame(x = x_new))
  mu <- pred$Y_hat
  sigma <- sqrt(pred$MSE)
  # Record predictions tagged with the iteration number for faceting.
  pred <- pred %>% data.frame() %>% select(c("Y_hat", "MSE", "complete_data.xnew.1"))
  pred <- pred %>% mutate("iteration" = t) %>% mutate("x_x" = NA) %>% mutate("y_y" = NA)
  data <- rbind(data, pred)
  # Expected improvement E[max(y_best - f(x), 0)].
  expected_improvement <- map2_dbl(
    mu, sigma,
    function(m, s) {
      if (s == 0) return(0)
      gamma <- (y_best - m) / s
      s * (gamma * pnorm(gamma) + dnorm(gamma))
    }
  )
  x_next_EI <- x_new[which.max(expected_improvement)]
  # Record the acquisition curve and the chosen point for this iteration.
  expected_improvement <- expected_improvement %>% as.data.frame()
  colnames(expected_improvement) <- "expected_improvement"
  expected_improvement <- expected_improvement %>%
    mutate("iteration" = t) %>%
    mutate("x_next_EI" = NA)
  acq_data <- rbind(acq_data, expected_improvement)
  x_next_df <- data.frame(x_next_EI) %>% mutate("expected_improvement" = NA) %>% mutate("iteration" = t)
  acq_data <- rbind(acq_data, x_next_df)
  t <- t + 1
}
# Append the observed design points to `data` for the faceted EI plots.
# After iteration k (k = 1..10) the design holds i = k + 4 points, so i runs
# over 5:14 and the facet index is i - 4. Prediction columns are NA-padded;
# x_x / y_y hold the point coordinates.
# NOTE(review): assumes `evaluation` holds exactly 14 rows in acquisition
# order here — confirm against the preceding loop.
t <- 1
nr <- nrow(data)
for (i in 5:14) {
for (j in 1:i) {
data[nr + t,] = c(NA, NA, NA, i-4, evaluation[j, 1], evaluation[j, 2])
t <- t+1
}
}
# Faceted posterior per iteration: mean (gold), +/- 1 sd band, design points.
plot_1 <- ggplot(data) +
  geom_line(aes(x = complete_data.xnew.1, y = Y_hat), color = "gold") +
  geom_ribbon(aes(x = complete_data.xnew.1, y = Y_hat, xmin = 0, xmax = 1, ymin = Y_hat - sqrt(MSE), ymax = Y_hat + sqrt(MSE)), fill = "navyblue", alpha = 0.3) +
  geom_point(aes(x = x_x, y = y_y)) +
  facet_grid(~iteration) +
  xlab("x") +
  ylab("f(x)") +
  theme_bw()
# One extra NA per iteration aligns x_new with the appended x_next row in acq_data.
x_new_df <- as.data.frame(rep(c(x_new, NA), 10))
colnames(x_new_df) <- "x_new"
acq_data <- cbind(acq_data, x_new_df)
# Faceted acquisition curve with the chosen point marked per iteration.
plot_2 <- ggplot(acq_data) +
  geom_line(aes(x = x_new, y = expected_improvement), color = "navyblue") +
  geom_vline(aes(xintercept = x_next_EI), color = "red", linetype = "dashed") +
  xlab("x") +
  facet_grid(~ iteration) +
  theme_bw()
print(plot_1)
## Warning: Removed 14 row(s) containing missing values (geom_path).
## Warning: Removed 1000 rows containing missing values (geom_point).
print(plot_2)
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1000 rows containing missing values (geom_vline).
This acquisition function balances the area where mean(x) is low (promising for minimization) against the area where sigma(x) is large (uncertain). In other words, it trades off exploration against exploitation.
# --- Bayesian optimization loop driven by the lower confidence bound ---
# (LCB does not depend on the incumbent y_best, so no incumbent update needed.)
evaluation <- evaluation_t  # restart from the initial 4-point design
t <- 1
# Accumulators. (The original's NULL-column assignments on empty data frames
# were no-ops and are dropped; `=` assignments replaced with `<-`.)
data <- data.frame()
acq_data <- data.frame()
x_new <- seq(0, 1, length.out = 100)  # prediction grid; loop-invariant, hoisted
while (t <= 8) {
  evaluation[nrow(evaluation) + 1, ] <- c(x_next_LCB, f(x_next_LCB))
  fit <- GP_fit(
    X = evaluation[, "x"],
    Y = evaluation[, "y"],
    corr = list(type = "exponential", power = 1.95)
  )
  pred <- predict.GP(fit, xnew = data.frame(x = x_new))
  mu <- pred$Y_hat
  sigma <- sqrt(pred$MSE)
  # Record predictions tagged with the iteration number for faceting.
  pred <- pred %>% data.frame() %>% select(c("Y_hat", "MSE", "complete_data.xnew.1"))
  pred <- pred %>% mutate("iteration" = t) %>% mutate("x_x" = NA) %>% mutate("y_y" = NA)
  data <- rbind(data, pred)
  # LCB acquisition: minimize mu - kappa * sigma.
  lower_confidence_bound <- mu - kappa * sigma
  x_next_LCB <- x_new[which.min(lower_confidence_bound)]
  # Record the acquisition curve and the chosen point for this iteration.
  lower_confidence_bound <- lower_confidence_bound %>% as.data.frame()
  colnames(lower_confidence_bound) <- "lower_confidence_bound"
  lower_confidence_bound <- lower_confidence_bound %>%
    mutate("iteration" = t) %>%
    mutate("x_next_LCB" = NA)
  acq_data <- rbind(acq_data, lower_confidence_bound)
  x_next_df <- data.frame(x_next_LCB) %>% mutate("lower_confidence_bound" = NA) %>% mutate("iteration" = t)
  acq_data <- rbind(acq_data, x_next_df)
  t <- t + 1
}
# Append the observed design points to `data` for the faceted LCB plots.
# This section runs 8 iterations, so after iteration k (k = 1..8) the design
# holds i = k + 4 points; i runs over 5:12 and the facet index is i - 4.
# Prediction columns are NA-padded; x_x / y_y hold the point coordinates.
# NOTE(review): assumes `evaluation` holds exactly 12 rows in acquisition
# order here — confirm against the preceding loop.
t <- 1
nr <- nrow(data)
for (i in 5:12) {
for (j in 1:i) {
data[nr + t,] = c(NA, NA, NA, i-4, evaluation[j, 1], evaluation[j, 2])
t <- t+1
}
}
# Faceted posterior per iteration: mean (gold), +/- 1 sd band, design points.
plot_1 <- ggplot(data) +
  geom_line(aes(x = complete_data.xnew.1, y = Y_hat), color = "gold") +
  geom_ribbon(aes(x = complete_data.xnew.1, y = Y_hat, xmin = 0, xmax = 1, ymin = Y_hat - sqrt(MSE), ymax = Y_hat + sqrt(MSE)), fill = "navyblue", alpha = 0.3) +
  geom_point(aes(x = x_x, y = y_y)) +
  facet_grid(~iteration) +
  xlab("x") +
  ylab("f(x)") +
  theme_bw()
# One extra NA per iteration (8 here) aligns x_new with the appended x_next rows.
x_new_df <- as.data.frame(rep(c(x_new, NA), 8))
colnames(x_new_df) <- "x_new"
acq_data <- cbind(acq_data, x_new_df)
# Faceted acquisition curve with the chosen point marked per iteration.
plot_2 <- ggplot(acq_data) +
  geom_line(aes(x = x_new, y = lower_confidence_bound), color = "navyblue") +
  geom_vline(aes(xintercept = x_next_LCB), color = "red", linetype = "dashed") +
  xlab("x") +
  facet_grid(~ iteration) +
  theme_bw()
print(plot_1)
## Warning: Removed 12 row(s) containing missing values (geom_path).
## Warning: Removed 800 rows containing missing values (geom_point).
print(plot_2)
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 800 rows containing missing values (geom_vline).
We will use the “expected improvement” acquisition function. Let us assume we have 10 chances to evaluate.
# Final run: 10 evaluations guided by expected improvement, plotting the
# posterior after each evaluation.
t <- 1
while (t <= 10) {
  evaluation[nrow(evaluation) + 1, ] <- c(x_next_EI, f(x_next_EI))
  # BUG FIX: refresh the incumbent best each iteration; the original reused
  # the stale y_best computed long before this loop, which distorts EI.
  y_best <- min(evaluation[, "y"])
  fit <- GP_fit(
    X = evaluation[, "x"],
    Y = evaluation[, "y"],
    corr = list(type = "exponential", power = 1.95)
  )
  x_new <- seq(0, 1, length.out = 100)
  pred <- predict.GP(fit, xnew = data.frame(x = x_new))
  mu <- pred$Y_hat
  sigma <- sqrt(pred$MSE)
  pred <- pred %>% data.frame()
  # Show the current posterior and all evaluated points. (The unused
  # y/xmin/xmax ribbon aesthetics from the original are dropped.)
  print(ggplot() +
    geom_line(mapping = aes(x = pred$complete_data.xnew.1, y = pred$Y_hat), color = "gold") +
    geom_ribbon(mapping = aes(x = pred$complete_data.xnew.1, ymin = pred$Y_hat - sqrt(pred$MSE), ymax = pred$Y_hat + sqrt(pred$MSE)), fill = "navyblue", alpha = 0.3) +
    geom_point(mapping = aes(x = evaluation[, "x"], y = evaluation[, "y"]), color = "navyblue") +
    xlab("x") +
    ylab("f(x)") +
    ggtitle(paste("The", t, "times evaluation.")) +
    theme_bw())
  # Expected improvement E[max(y_best - f(x), 0)] and the next proposal.
  expected_improvement <- map2_dbl(
    mu, sigma,
    function(m, s) {
      if (s == 0) return(0)
      gamma <- (y_best - m) / s
      s * (gamma * pnorm(gamma) + dnorm(gamma))
    }
  )
  x_next_EI <- x_new[which.max(expected_improvement)]
  t <- t + 1
}
Then we pick the x value that has the minimal f(x) value from what we evaluated.
### Pick the x that has the minimal f(x) value from what we evaluated.
# BUG FIX: the printed message misspelled "optimized" as "opmized".
cat("The optimized x we find is", evaluation[which.min(evaluation$y),]$x)
## The optimized x we find is 0.7575758
### Compare it with the real x that minimizes f(x) in [0, 1].
cat("The real optimized x is", x_min)
## The real optimized x is 0.7572
We can see that we use 10 evaluations to get the x, and the result is pretty close to the real one.
In this section, we will compare the Bayesian optimization with quasi-Newton method. We will try several initial guesses.
### Firstly, we build several equally spaced initial guesses in [0, 1].
ini_guess <- seq(0, 1, length.out = 9)
### We need to change the function to limit the range of x between 0 and 1.
f <- function(x) {
  ### If x is not in [0, 1], we give the function a high penalty value (123).
  ### The range gets a small "buffer" so the function stays continuous on [0, 1].
  # BUG FIX: the original scalar `if (x > 1.001 || x < -0.001)` lost the
  # vectorization of the earlier f() and errors on vector input in R >= 4.2.
  # Apply the penalty elementwise instead; scalar behavior is unchanged.
  out <- (6 * x - 2)^2 * sin(12 * x - 4)
  out[x > 1.001 | x < -0.001] <- 123
  out
}
### Then we will use the quasi-Newton method (BFGS) from each initial guess
### and report the optimum found, its value, and the iteration count.
n <- 1
for (t in ini_guess) {
  res <- optim(t, f, method = "BFGS")
  writeLines(paste(
    "The", n, "initial guess x0 = ", t, ":\nx=", res$par,
    "\nf(x)=", res$value, "\nTimes of iteration=", res$counts[1],
    "\n-----------------------------------------------"
  ))
  n <- n + 1
}
## The 1 initial guess x0 = 0 :
## x= 0.142591750809044
## f(x)= -0.98632540532154
## Times of iteration= 42
## -----------------------------------------------
## The 2 initial guess x0 = 0.125 :
## x= 0.142591743095779
## f(x)= -0.98632540532755
## Times of iteration= 36
## -----------------------------------------------
## The 3 initial guess x0 = 0.25 :
## x= 0.14259182798473
## f(x)= -0.986325405260402
## Times of iteration= 34
## -----------------------------------------------
## The 4 initial guess x0 = 0.375 :
## x= 0.142588616315564
## f(x)= -0.986325406271061
## Times of iteration= 27
## -----------------------------------------------
## The 5 initial guess x0 = 0.5 :
## x= 0.14259181984085
## f(x)= -0.986325405266939
## Times of iteration= 38
## -----------------------------------------------
## The 6 initial guess x0 = 0.625 :
## x= 0.757249312241851
## f(x)= -6.02074005560295
## Times of iteration= 23
## -----------------------------------------------
## The 7 initial guess x0 = 0.75 :
## x= 0.757249398105312
## f(x)= -6.02074005554817
## Times of iteration= 13
## -----------------------------------------------
## The 8 initial guess x0 = 0.875 :
## x= 0.757249103094232
## f(x)= -6.02074005570343
## Times of iteration= 24
## -----------------------------------------------
## The 9 initial guess x0 = 1 :
## x= 0.142591622629972
## f(x)= -0.986325405419072
## Times of iteration= 50
## -----------------------------------------------
We can see there are two disadvantages of quasi-Newton method. The first one is we have to choose a “good” initial guess to find the global optimum. In our example, if the initial guess is bigger than 0.5, the quasi-Newton method will find a local optimum.
The second one is that it needs more iterations to reach the optimal value. The minimum number of iterations in our example is 13, which is 3 more than in our Bayesian optimization example.
Most of the code is adapted from this website.